Setup

# Setup file locations to import 
beer_csv_loc <- "./StartingDBs/Beers.csv"
breweries_csv_loc <- "./StartingDBs/Breweries.csv"

# Read in data 
beer_data <- read.csv(beer_csv_loc, header = TRUE)
brewery_data <- read.csv(breweries_csv_loc, header = TRUE)

Case study Questions

1. How many breweries are present in each state?

# Create a seperate data frame to store state data 
### Note: This data set counts the district of columbia as a state
num_of_breweries_by_state <- data.frame(table(brewery_data$State))

2. Merge beer data with the breweries data. Print the first 6 observations and

#the last six observations to check the merged file

# Change column name of beer_data to match brewery_data to use as a primary key
colnames(beer_data)[5] <- "Brew_ID"

# Merge two data bases using merge
full_brew_data <- merge(brewery_data, beer_data, by="Brew_ID")

# Rename columns 2 and 5 that were changed during the merge
colnames(full_brew_data)[2] <- "Brewery"
colnames(full_brew_data)[5] <- "Beer_Name"

# Print first 6 and last 6 observations 
head(full_brew_data, n=6)
##   Brew_ID            Brewery        City State     Beer_Name Beer_ID   ABV IBU
## 1       1 NorthGate Brewing  Minneapolis    MN       Pumpion    2689 0.060  38
## 2       1 NorthGate Brewing  Minneapolis    MN    Stronghold    2688 0.060  25
## 3       1 NorthGate Brewing  Minneapolis    MN   Parapet ESB    2687 0.056  47
## 4       1 NorthGate Brewing  Minneapolis    MN  Get Together    2692 0.045  50
## 5       1 NorthGate Brewing  Minneapolis    MN Maggie's Leap    2691 0.049  26
## 6       1 NorthGate Brewing  Minneapolis    MN    Wall's End    2690 0.048  19
##                                 Style Ounces
## 1                         Pumpkin Ale     16
## 2                     American Porter     16
## 3 Extra Special / Strong Bitter (ESB)     16
## 4                        American IPA     16
## 5                  Milk / Sweet Stout     16
## 6                   English Brown Ale     16
tail(full_brew_data, n=6)
##      Brew_ID                       Brewery          City State
## 2405     556         Ukiah Brewing Company         Ukiah    CA
## 2406     557       Butternuts Beer and Ale Garrattsville    NY
## 2407     557       Butternuts Beer and Ale Garrattsville    NY
## 2408     557       Butternuts Beer and Ale Garrattsville    NY
## 2409     557       Butternuts Beer and Ale Garrattsville    NY
## 2410     558 Sleeping Lady Brewing Company     Anchorage    AK
##                      Beer_Name Beer_ID   ABV IBU                   Style Ounces
## 2405             Pilsner Ukiah      98 0.055  NA         German Pilsener     12
## 2406         Porkslap Pale Ale      49 0.043  NA American Pale Ale (APA)     12
## 2407           Snapperhead IPA      51 0.068  NA            American IPA     12
## 2408         Moo Thunder Stout      50 0.049  NA      Milk / Sweet Stout     12
## 2409  Heinnieweisse Weissebier      52 0.049  NA              Hefeweizen     12
## 2410 Urban Wilderness Pale Ale      30 0.049  NA        English Pale Ale     12

3. Address any values

# Create two new tables where one has only ABV with no <NA>'s and the other 
state_abv <- data.frame(State=full_brew_data$State, ABV=full_brew_data$ABV)
state_abv <- state_abv %>% drop_na(ABV)

# Do the same for IBU 
state_ibu <- data.frame(State=full_brew_data$State, IBU=full_brew_data$IBU)
state_ibu <- state_ibu %>% drop_na(IBU)

4. Compute the median alcohol content and international bitterness unit for each

##state. Plot a bar chart

# Find the average ABV for the state_abv table 
avg_state_abv <- ddply(state_abv, .(State), function(x) mean(x$ABV))
colnames(avg_state_abv)[2] <- "Average_ABV" # Rename the column
# Drop DC. It's not a state....yet
avg_state_abv <- avg_state_abv[-c(8),] # DC is equal to position 8 on the table 
# Order the states by Decending ABV content 
avg_state_abv <- avg_state_abv[order(avg_state_abv$Average_ABV, decreasing=TRUE),]

# Find Average IBU
avg_state_ibu <- ddply(state_ibu, .(State), function(x) mean(x$IBU))
colnames(avg_state_ibu)[2] <- "Average_IBU" # Rename column
# Order the states by IBU Decending 
avg_state_ibu <- avg_state_ibu[order(avg_state_ibu$Average_IBU, decreasing=TRUE),]

# Plot Average Alcohol by Volume per state
ggplot(avg_state_abv, aes(x=reorder(State, desc(Average_ABV)), y=Average_ABV, fill=State)) + 
       geom_col(show.legend = FALSE, width=.9, position="dodge") +
       xlab("State") + 
       ylab("Average Alcohol by Volume") 

# Plot Average International Bitterness Unit per State 
ggplot(avg_state_ibu, aes(x=reorder(State, desc(Average_IBU)), y=Average_IBU, fill=State)) + 
       geom_col(show.legend = FALSE, width=.9, position="dodge") +
       xlab("State") + 
       ylab("Average IBU")

5. Which state has the the Max ABV. Which has the

# Get the State with the highest average ABV
state_max_abv <- avg_state_abv[which.max(avg_state_abv$Average_ABV),]
# Get the State with the highest average IBU
state_max_ibu <- avg_state_ibu[which.max(avg_state_ibu$Average_IBU),]

6. Comment on the Summary stats and distribution of the ABV variable

summary(state_abv)
##     State                ABV         
##  Length:2348        Min.   :0.00100  
##  Class :character   1st Qu.:0.05000  
##  Mode  :character   Median :0.05600  
##                     Mean   :0.05977  
##                     3rd Qu.:0.06700  
##                     Max.   :0.12800
ggplot(state_abv, aes(x=ABV)) + geom_histogram() # Right Skewness
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

##7. Is there an apparent relationship between the bitterness of the beer and its alcoholic content?

# Create a new data frame from the beer data that removes rows that have <NA> in 
# the ABV and IBU columns
beer_data_no_na <- beer_data
beer_data_no_na <- beer_data_no_na %>% drop_na(ABV)
beer_data_no_na <- beer_data_no_na %>% drop_na(IBU)

# Scatter plot to show raw data 
ggplot(beer_data_no_na, aes(x=ABV, y=IBU, colour=Style)) + 
       geom_point(show.legend = FALSE) + 
       xlab("ABV in %") + 
       ylab("IBU")

# Plot a smooth curve to see a more "linear" pattern 
ggplot(beer_data_no_na, aes(x=ABV, y=IBU)) + 
       geom_smooth() +
       xlab("ABV in %") + 
       ylab("IBU")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

8. investigate the difference with respect to IBU and ABV between IPAs (India Pale Ales)

#and other types of Ale. Use KNN.

# Create a data frame with only IPA's
#df1[grep("dog",df1$type),]
ipa_beers <- beer_data_no_na[grep("IPA",beer_data_no_na$Style),]
# Change the Sytle column to only have the value of IPA for easier model training
ipa_beers$Style <- "IPA"

ales_beer <- beer_data_no_na[grep("Ale",beer_data_no_na$Style),]
# Change the Sytle column to only have the value of Ale for easier model training
ales_beer$Style <- "Ale"

# Combine the two data sets 
ipas_and_ales <- rbind(ipa_beers, ales_beer)

# plot the data to get an initial view of the data
ggplot(ipas_and_ales, aes(x=ABV, y=IBU, colour=Style)) + geom_point()

# Create train and test data frames
ipas_and_ales_train <- ipas_and_ales # train data needs to be a duplicate of orig
ipas_and_ales_test <- data.frame()

# Split 70/30 (Train/Test)
samples_to_take <- round(as.numeric(nrow(ipas_and_ales_train)*.3), digits=0)
for (row in 1:samples_to_take) {
  tmp_row <- sample_n(ipas_and_ales_train, 1) # Take sample
  # Add Sample to test data
  ipas_and_ales_test <- ipas_and_ales_test %>% rbind(tmp_row)
  # Remove Sample from train data
  ipas_and_ales_train <- ipas_and_ales_train[!(ipas_and_ales_train$Beer_ID==tmp_row$Beer_ID),]
}

# find an optimal K value for the KNN model
style_accuracy <- data.frame(accuract=numeric(50), k=numeric(50))
for (iter in 1:50) {
  style_class <- knn(ipas_and_ales_train[,c(3,4)], ipas_and_ales_test[c(3,4)],
                     ipas_and_ales_train$Style,
                     prob=TRUE, k=iter)
  table(ipas_and_ales_test$Style, style_class)
  cm <- confusionMatrix(table(ipas_and_ales_test$Style, style_class))
  style_accuracy$accuracy[iter] <- cm$overall[1]
  style_accuracy$k[iter] <- iter
}
#plot(style_accuracy$k, style_accuracy$accuracy, type="l", xlab="k")
plot_ly(style_accuracy, x=style_accuracy$k, y=style_accuracy$accuracy, 
        type="scatter", mode="lines")
##
## On average this loop gives me an optimal k level of k=5,6,7
##